1 데이터와 패키지 불러오기

1.1 사용할 패키지와 소스

1.2 데이터 불러오기

# Load the UCI Wine data (no header row; columns arrive as V1..V14).
# NOTE(review): absolute user-specific path — works only on the author's machine.
dat = read.csv("/Users/iseunghun/course/결측데이터분석/project2/wine.data", header = F)
set.seed(1)
#V1 : Cluster Label (True Value) [59 vs 71 vs 48]
#V2 : Alcohol
#V3 : Malic acid
#V4 : Ash
#V5 : Alcalinity of ash
#V6 : Magnesium
#V7 : Total phenols
#V8 : Flavanoids
#V9 : Nonflavanoid phenols
#V10: Proanthocyanins
#V11: Color intensity
#V12: Hue
#V13: OD280/OD315 of diluted wines (original comment said OD313; UCI docs say OD315)
#V14: Proline

# Drop the true cluster label (V1) to obtain the feature-only data set.
dat2<-dat[,-1]

2 create missing

어떤 그룹에 속하느냐에 따라서 missing 비율이 달라진다고 가정한다. 따라서 NMAR 가정에 가깝다. 결측을 발생시킬 변수들은 다음과 같이 그룹마다 차이가 큰 변수를 채택

# Boxplots of the four candidate variables by true cluster (V1).
# These variables show clear between-cluster separation, which is why
# missingness is injected into them below.
ggplot(dat)+
  geom_boxplot(mapping = aes(x=V7,y=as.factor(V1), color = as.factor(V1)))

ggplot(dat)+
  geom_boxplot(mapping = aes(x=V8,y=as.factor(V1), color = as.factor(V1)))

ggplot(dat)+
  geom_boxplot(mapping = aes(x=V12,y=as.factor(V1), color = as.factor(V1)))

ggplot(dat)+
  geom_boxplot(mapping = aes(x=V13,y=as.factor(V1), color = as.factor(V1)))

## get PCs

PCA를 진행한 다음 클러스터 레이블과 연관성이 가장 깊은 성분을 채택하여 결측 발생에 이용한다. 이 과정에서 MAR의 성격이 다소 포함되지만 주로 클러스터 레이블이 영향을 주고 있기 때문에 MNAR에 가깝다고 할 수 있다.

# PCA on the columns that stay fully observed (V7, V8, V12, V13 excluded).
# NOTE(review): dat[,-c(7,8,12,13)] still contains the label V1 as its first
# column, so V1 itself contributes to the loadings below — confirm intended.
pca = princomp(scale(dat[,-c(7,8,12,13)]))
pca$loadings
## 
## Loadings:
##     Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
## V1   0.462  0.167  0.160  0.319  0.162  0.262                       0.728 
## V2  -0.323  0.401  0.310 -0.121                0.168 -0.698 -0.232  0.193 
## V3   0.234  0.343  0.163  0.371 -0.482 -0.599 -0.188  0.146               
## V4          0.448 -0.544 -0.339        -0.151  0.142  0.337 -0.433  0.209 
## V5   0.357  0.207 -0.499                       0.345 -0.406  0.483 -0.209 
## V6  -0.257  0.247 -0.327  0.495  0.542        -0.436 -0.161               
## V9   0.347  0.213        -0.483         0.225 -0.717 -0.152               
## V10 -0.322        -0.304  0.277 -0.652  0.461 -0.216                0.173 
## V11         0.533  0.314  0.140         0.509  0.197  0.311        -0.444 
## V14 -0.456  0.229        -0.238                       0.253  0.714  0.311 
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## SS loadings       1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0
## Proportion Var    0.1    0.1    0.1    0.1    0.1    0.1    0.1    0.1    0.1
## Cumulative Var    0.1    0.2    0.3    0.4    0.5    0.6    0.7    0.8    0.9
##                Comp.10
## SS loadings        1.0
## Proportion Var     0.1
## Cumulative Var     1.0

2.1 13% missing data

2.1.1 create missing

##### 13% missing graphs (Red : Observed, Blue : Missing)
# Re-fit PCA on the fully observed block (V1 is still included, see above).
pca = princomp(scale(dat[,-c(7,8,12,13)]))

s <- pca$scores
dat13<-dat[,-1]
# Per-variable Bernoulli indicators (one seed each so the four variables get
# independent missingness patterns).
# NOTE(review): prob = c(0.13, 0.85) sums to 0.98; sample() renormalises, so
# P(0) is really 0.13/0.98 ≈ 0.133 — presumably c(0.15, 0.85) or
# c(0.13, 0.87) was intended. Confirm before reuse.
set.seed(1)
index1 = sample(size = nrow(dat),x = c(0,1), replace = T,prob = c(0.13,0.85))
set.seed(2)
index2 = sample(size = nrow(dat),x = c(0,1), replace = T,prob = c(0.13,0.85))
set.seed(5)
index3 = sample(size = nrow(dat),x = c(0,1), replace = T,prob = c(0.13,0.85))
set.seed(7)
index4 = sample(size = nrow(dat),x = c(0,1), replace = T,prob = c(0.13,0.85))


# Rows in the top 15% of the 10th PC score are eligible; an eligible row loses
# the variable when its Bernoulli index equals 1 (the high-probability outcome),
# which yields roughly 13% missing per variable overall.
dat13$V7[s[,10] >= quantile(s[,10],0.85) & index1 == 1]<-NA
dat13$V8[s[,10] >= quantile(s[,10],0.85) & index2 == 1]<-NA
dat13$V12[s[,10] >= quantile(s[,10],0.85) & index3 == 1]<-NA
dat13$V13[s[,10] >= quantile(s[,10],0.85) & index4 == 1]<-NA

2.1.2 missing prop per cluster

# Share of rows with at least one NA, within each true cluster: the missing
# rate climbs with the cluster label, confirming label-dependent missingness.
sum(rowSums(is.na(dat13)) > 0 & dat$V1 == 1) / sum(dat$V1 == 1)
## [1] 0.08474576
sum(rowSums(is.na(dat13)) > 0 & dat$V1 == 2) / sum(dat$V1 == 2)
## [1] 0.1126761
sum(rowSums(is.na(dat13)) > 0 & dat$V1 == 3) / sum(dat$V1 == 3)
## [1] 0.2916667

2.1.3 missing prop per var

# Fraction of missing values per variable (denominator = all 178 rows;
# dat and dat13 have the same row count).
colMeans(is.na(dat13))
##        V2        V3        V4        V5        V6        V7        V8        V9 
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.1348315 0.1348315 0.0000000 
##       V10       V11       V12       V13       V14 
## 0.0000000 0.0000000 0.1348315 0.1404494 0.0000000

2.1.4 missing data pattern

# Missing-data pattern matrix (mice::md.pattern): 1 = observed, 0 = missing;
# left margin counts rows per pattern, bottom margin counts NAs per variable.
md.pattern(dat13)

##     V2 V3 V4 V5 V6 V9 V10 V11 V14 V7 V8 V12 V13   
## 151  1  1  1  1  1  1   1   1   1  1  1   1   1  0
## 1    1  1  1  1  1  1   1   1   1  1  1   0   0  2
## 1    1  1  1  1  1  1   1   1   1  1  0   0   1  2
## 1    1  1  1  1  1  1   1   1   1  1  0   0   0  3
## 2    1  1  1  1  1  1   1   1   1  0  1   0   0  3
## 3    1  1  1  1  1  1   1   1   1  0  0   1   0  3
## 1    1  1  1  1  1  1   1   1   1  0  0   0   1  3
## 18   1  1  1  1  1  1   1   1   1  0  0   0   0  4
##      0  0  0  0  0  0   0   0   0 24 24  24  25 97

2.2 27% missing

##### 27% missing graphs (Red : Observed, Blue : Missing)
# Same seeds as the 13% case, so these Bernoulli indices are identical to the
# ones above; only the PC10 cutoff is relaxed (85th -> 70th percentile).
# NOTE(review): prob = c(0.13, 0.85) sums to 0.98 and is renormalised by
# sample() — see the note in the 13% chunk.
set.seed(1)
index1 = sample(size = nrow(dat),x = c(0,1), replace = T,prob = c(0.13,0.85))
set.seed(2)
index2 = sample(size = nrow(dat),x = c(0,1), replace = T,prob = c(0.13,0.85))
set.seed(5)
index3 = sample(size = nrow(dat),x = c(0,1), replace = T,prob = c(0.13,0.85))
set.seed(7)
index4 = sample(size = nrow(dat),x = c(0,1), replace = T,prob = c(0.13,0.85))


dat27<-dat[,-1]

# `s` (the PC score matrix) is reused from the 13% chunk above.
dat27$V7[s[,10] >= quantile(s[,10],0.70) & index1 == 1]<-NA
dat27$V8[s[,10] >= quantile(s[,10],0.70) & index2 == 1]<-NA
dat27$V12[s[,10] >= quantile(s[,10],0.70) & index3 == 1]<-NA
dat27$V13[s[,10] >= quantile(s[,10],0.70) & index4 == 1]<-NA

2.2.1 missing prop per cluster

# Share of rows with at least one NA per true cluster, 27% setting: the
# gradient over the cluster labels is even steeper than in the 13% case.
sum(rowSums(is.na(dat27)) > 0 & dat$V1 == 1) / sum(dat$V1 == 1)
## [1] 0.1864407
sum(rowSums(is.na(dat27)) > 0 & dat$V1 == 2) / sum(dat$V1 == 2)
## [1] 0.2957746
sum(rowSums(is.na(dat27)) > 0 & dat$V1 == 3) / sum(dat$V1 == 3)
## [1] 0.4583333

2.2.2 missing prop per var

# Fraction of missing values per variable in the 27% data set.
colMeans(is.na(dat27))
##        V2        V3        V4        V5        V6        V7        V8        V9 
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.2696629 0.2640449 0.0000000 
##       V10       V11       V12       V13       V14 
## 0.0000000 0.0000000 0.2696629 0.2865169 0.0000000

2.2.3 missing data pattern

md.pattern(dat27)

##     V2 V3 V4 V5 V6 V9 V10 V11 V14 V8 V7 V12 V13    
## 124  1  1  1  1  1  1   1   1   1  1  1   1   1   0
## 2    1  1  1  1  1  1   1   1   1  1  1   0   0   2
## 5    1  1  1  1  1  1   1   1   1  1  0   0   0   3
## 1    1  1  1  1  1  1   1   1   1  0  1   0   1   2
## 3    1  1  1  1  1  1   1   1   1  0  1   0   0   3
## 6    1  1  1  1  1  1   1   1   1  0  0   1   0   3
## 2    1  1  1  1  1  1   1   1   1  0  0   0   1   3
## 35   1  1  1  1  1  1   1   1   1  0  0   0   0   4
##      0  0  0  0  0  0   0   0   0 47 48  48  51 194

스케일에 차이가 많이 나기 때문에, scale 한 다음에 k-means를 시도해야 함

# Standardise each data set column-wise.  scale() centres/scales using the
# non-missing values (presumably via na.rm handling — confirm), so the
# incomplete data sets keep their NAs in place.
dat2_scaled = data.frame(scale(dat2))
dat13_scaled = data.frame(scale(dat13))
dat27_scaled = data.frame(scale(dat27))

3 missing & observed 변수의 분포 (MCAR 아님을 확인)

missing index 지정해주는 코드

# Character flags ("Missing"/"Observed") per target variable, used as the
# colour/fill grouping in the histograms below.
missing_V7_13 <- ifelse(is.na(dat13$V7), "Missing", "Observed")
missing_V8_13 <- ifelse(is.na(dat13$V8), "Missing", "Observed")
missing_V12_13 <- ifelse(is.na(dat13$V12), "Missing", "Observed")
missing_V13_13 <- ifelse(is.na(dat13$V13), "Missing", "Observed")

missing_V7_27 <- ifelse(is.na(dat27$V7), "Missing", "Observed")
missing_V8_27 <- ifelse(is.na(dat27$V8), "Missing", "Observed")
missing_V12_27 <- ifelse(is.na(dat27$V12), "Missing", "Observed")
missing_V13_27 <- ifelse(is.na(dat27$V13), "Missing", "Observed")

4 13% missing (클러스터 고려하지 않음)

# 13% setting, clusters pooled: histogram of each target variable split by
# missing/observed status (x values come from the complete data dat2_scaled).
temp7 = data.frame(Total_phenols = dat2_scaled$V7, missing = missing_V7_13)
ggplot(temp7, mapping = aes(x = Total_phenols, color = missing, fill = missing)) +
  geom_histogram(alpha = 0.2, position = "identity", binwidth = 0.5)

temp8 = data.frame(Flavanoids = dat2_scaled$V8, missing = missing_V8_13)
ggplot(temp8, mapping = aes(x = Flavanoids, color = missing, fill = missing)) +
  geom_histogram(alpha = 0.2, position = "identity", binwidth = 0.5)

temp12 = data.frame(Hue = dat2_scaled$V12, missing = missing_V12_13)
ggplot(temp12, mapping = aes(x = Hue, color = missing, fill = missing)) +
  geom_histogram(alpha = 0.2, position = "identity", binwidth = 0.5)

temp13 = data.frame(ratio = dat2_scaled$V13, missing = missing_V13_13)
ggplot(temp13, mapping = aes(x = ratio, color = missing, fill = missing)) +
  geom_histogram(alpha = 0.2, position = "identity", binwidth = 0.5)

4.1 missing 27% (클러스터 고려하지 않음)

# 27% setting, clusters pooled: same histograms with the 27% missing flags.
temp7 = data.frame(Total_phenols = dat2_scaled$V7, missing = missing_V7_27)
ggplot(temp7, mapping = aes(x = Total_phenols, color = missing, fill = missing)) +
  geom_histogram(alpha = 0.2, position = "identity", binwidth = 0.5)

temp8 = data.frame(Flavanoids = dat2_scaled$V8, missing = missing_V8_27)
ggplot(temp8, mapping = aes(x = Flavanoids, color = missing, fill = missing)) +
  geom_histogram(alpha = 0.2, position = "identity", binwidth = 0.5)

temp12 = data.frame(Hue = dat2_scaled$V12, missing = missing_V12_27)
ggplot(temp12, mapping = aes(x = Hue, color = missing, fill = missing)) +
  geom_histogram(alpha = 0.2, position = "identity", binwidth = 0.5)

temp13 = data.frame(ratio = dat2_scaled$V13, missing = missing_V13_27)
ggplot(temp13, mapping = aes(x = ratio, color = missing, fill = missing)) +
  geom_histogram(alpha = 0.2, position = "identity", binwidth = 0.5)

K-means 알고리듬의 핵심은 적절한 centroid를 찾을 수 있냐 이기 때문에 클러스터에 대한 정보를 이용하는 것이 더 타당하다. 클러스터를 고려치 않을시에는 오히려 MCAR 이어 보이지만 mean imputation을 진행하여 클러스터링을 진행할시 가장 성능이 좋지 않을것이다.

5 13% missing (클러스터 고려)

5.0.1 Cluster1

# Cluster 1, 13% missing: distribution of each target variable split by
# missing/observed status, restricted to rows with true label 1.
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_13
temp7$V1 <- dat$V1
colnames(temp7) = c("ratio","missing","V1")
# FIX: the original plotted temp13 (V13's values) against V7's missing flags;
# plot temp7 itself.
ggplot(temp7[temp7$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_13
temp8$V1 <- dat$V1
colnames(temp8) = c("ratio","missing","V1")
ggplot(temp8[temp8$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_13
temp12$V1 <- dat$V1
colnames(temp12) = c("ratio","missing","V1")
ggplot(temp12[temp12$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_13
temp13$V1 <- dat$V1
colnames(temp13) = c("ratio","missing","V1")
ggplot(temp13[temp13$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

5.0.2 Cluster2

# Cluster 2, 13% missing: same per-variable comparison for true label 2.
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_13
temp7$V1 <- dat$V1
colnames(temp7) = c("ratio","missing","V1")
# FIX: original plotted temp13 with V7's flags; plot temp7.
ggplot(temp7[temp7$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_13
temp8$V1 <- dat$V1
colnames(temp8) = c("ratio","missing","V1")
ggplot(temp8[temp8$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_13
temp12$V1 <- dat$V1
colnames(temp12) = c("ratio","missing","V1")
# FIX: original plotted temp13 here as well; plot temp12.
ggplot(temp12[temp12$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_13
temp13$V1 <- dat$V1
colnames(temp13) = c("ratio","missing","V1")
ggplot(temp13[temp13$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

5.0.3 Cluster3

# Cluster 3, 13% missing: same per-variable comparison for true label 3.
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_13
temp7$V1 <- dat$V1
colnames(temp7) = c("ratio","missing","V1")
# FIX: original plotted temp13 with V7's flags; plot temp7.
ggplot(temp7[temp7$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_13
temp8$V1 <- dat$V1
colnames(temp8) = c("ratio","missing","V1")
ggplot(temp8[temp8$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_13
temp12$V1 <- dat$V1
colnames(temp12) = c("ratio","missing","V1")
# FIX: original plotted temp13 here as well; plot temp12.
ggplot(temp12[temp12$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_13
temp13$V1 <- dat$V1
colnames(temp13) = c("ratio","missing","V1")
ggplot(temp13[temp13$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

6 27% missing (클러스터 고려)

6.0.1 Cluster1

# Cluster 1, 27% missing: distribution of each target variable split by
# missing/observed status, restricted to rows with true label 1.
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_27
temp7$V1 <- dat$V1
colnames(temp7) = c("ratio","missing","V1")
# FIX: original plotted temp13 with V7's flags; plot temp7.
ggplot(temp7[temp7$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_27
temp8$V1 <- dat$V1
colnames(temp8) = c("ratio","missing","V1")
ggplot(temp8[temp8$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_27
temp12$V1 <- dat$V1
colnames(temp12) = c("ratio","missing","V1")
# FIX: original plotted temp13 here as well; plot temp12.
ggplot(temp12[temp12$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_27
temp13$V1 <- dat$V1
colnames(temp13) = c("ratio","missing","V1")
ggplot(temp13[temp13$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

6.0.2 Cluster2

# Cluster 2, 27% missing: same per-variable comparison for true label 2.
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_27
temp7$V1 <- dat$V1
# FIX: original renamed temp13's columns instead of temp7's, leaving temp7
# without a `ratio` column.
colnames(temp7) = c("ratio","missing","V1")
# FIX: original plotted the stale temp13; plot temp7.
ggplot(temp7[temp7$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_27
temp8$V1 <- dat$V1
colnames(temp8) = c("ratio","missing","V1")
ggplot(temp8[temp8$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_27
temp12$V1 <- dat$V1
colnames(temp12) = c("ratio","missing","V1")
# FIX: original plotted temp13 here as well; plot temp12.
ggplot(temp12[temp12$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_27
temp13$V1 <- dat$V1
colnames(temp13) = c("ratio","missing","V1")
ggplot(temp13[temp13$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

6.0.3 Cluster3

# Cluster 3, 27% missing: same per-variable comparison for true label 3.
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_27
temp7$V1 <- dat$V1
# FIX: original renamed temp13's columns instead of temp7's.
colnames(temp7) = c("ratio","missing","V1")
# FIX: original plotted the stale temp13; plot temp7.
ggplot(temp7[temp7$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_27
temp8$V1 <- dat$V1
colnames(temp8) = c("ratio","missing","V1")
ggplot(temp8[temp8$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_27
temp12$V1 <- dat$V1
colnames(temp12) = c("ratio","missing","V1")
# FIX: original plotted temp13 here as well; plot temp12.
ggplot(temp12[temp12$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_27
temp13$V1 <- dat$V1
colnames(temp13) = c("ratio","missing","V1")
ggplot(temp13[temp13$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
  geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)

클러스터 레이벨이 3>2>1 순으로, 그리고 missing 비율이 높아질수록 분포의 왜곡이 심해짐을 알 수 있다.

7 데이터 탐구 (EDA)

7.1 상관계수

corrplot(cor(dat2, method = "pearson"),method="ellipse")

7.2 변수들의 분산 확인

# Column variances of the raw features; the scales differ by orders of
# magnitude (V14 ~ 1e5 vs V4 ~ 0.075), which motivates standardising before
# k-means.  vapply avoids apply()'s data.frame-to-matrix coercion.
vapply(dat2, var, numeric(1))
##           V2           V3           V4           V5           V6           V7 
## 6.590623e-01 1.248015e+00 7.526464e-02 1.115269e+01 2.039893e+02 3.916895e-01 
##           V8           V9          V10          V11          V12          V13 
## 9.977187e-01 1.548863e-02 3.275947e-01 5.374449e+00 5.224496e-02 5.040864e-01 
##          V14 
## 9.916672e+04

스케일에 차이가 많이 나기 때문에, scale 한 다음에 k-means를 시도해야 함

# Re-create the standardised data sets (this repeats the earlier scaling step
# verbatim; the objects are simply overwritten with identical values).
dat2_scaled = data.frame(scale(dat2))
dat13_scaled = data.frame(scale(dat13))
dat27_scaled = data.frame(scale(dat27))

7.3 변수 정규성 가정 만족 여부

###V2

ggplot(dat2_scaled)+
    geom_histogram(mapping = aes(x=V2), binwidth= 0.3)

shapiro.test(dat2_scaled$V2)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat2_scaled$V2
## W = 0.9818, p-value = 0.02005

###V3

ggplot(dat2_scaled)+
    geom_histogram(mapping = aes(x=V3))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

shapiro.test(dat2_scaled$V3)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat2_scaled$V3
## W = 0.88878, p-value = 2.946e-10

###V4

ggplot(dat2_scaled)+
    geom_histogram(mapping = aes(x=V4), binwidth= 0.3)

shapiro.test(dat2_scaled$V4)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat2_scaled$V4
## W = 0.98395, p-value = 0.03868

###V5

ggplot(dat2_scaled)+
    geom_histogram(mapping = aes(x=V5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

shapiro.test(dat2_scaled$V5)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat2_scaled$V5
## W = 0.99023, p-value = 0.2639

###V6

ggplot(dat2_scaled)+
    geom_histogram(mapping = aes(x=V6), binwidth= 0.3)

shapiro.test(dat2_scaled$V6)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat2_scaled$V6
## W = 0.93833, p-value = 6.346e-07

###V7

ggplot(dat2_scaled)+
    geom_histogram(mapping = aes(x=V7), binwidth= 0.3)

shapiro.test(dat2_scaled$V7)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat2_scaled$V7
## W = 0.97668, p-value = 0.004395

###V8

ggplot(dat2_scaled)+
    geom_histogram(mapping = aes(x=V8))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

shapiro.test(dat2_scaled$V8)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat2_scaled$V8
## W = 0.95453, p-value = 1.679e-05

###V9

ggplot(dat2_scaled)+
    geom_histogram(mapping = aes(x=V9))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

shapiro.test(dat2_scaled$V9)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat2_scaled$V9
## W = 0.96252, p-value = 0.0001055

###V10

ggplot(dat2_scaled)+
    geom_histogram(mapping = aes(x=V10), binwidth= 0.3)

shapiro.test(dat2_scaled$V10)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat2_scaled$V10
## W = 0.98072, p-value = 0.01445

###V11

ggplot(dat2_scaled)+
    geom_histogram(mapping = aes(x=V11), binwidth= 0.3)

shapiro.test(dat2_scaled$V11)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat2_scaled$V11
## W = 0.94032, p-value = 9.229e-07

###V12

ggplot(dat2_scaled)+
    geom_histogram(mapping = aes(x=V12), binwidth= 0.3)

shapiro.test(dat2_scaled$V12)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat2_scaled$V12
## W = 0.98134, p-value = 0.01743

###V13

ggplot(dat2_scaled)+
    geom_histogram(mapping = aes(x=V13), binwidth= 1)

shapiro.test(dat2_scaled$V13)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat2_scaled$V13
## W = 0.94505, p-value = 2.316e-06

###V14

ggplot(dat2_scaled)+
    geom_histogram(mapping = aes(x=V14))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

shapiro.test(dat2_scaled$V14)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat2_scaled$V14
## W = 0.93119, p-value = 1.741e-07

standardizing 하고나서, 정규성은 만족하지 않음. 변수변환도 시도해봤으나 큰 도움이 되지 못했다. 오히려 정규성 가정을 만족하지 않는 변수를 imputation 등을 하여 클러스터링을 했을때 성능이 어떻게 되는지 확인해 보는게 더 의미 있다고 판단함.

8 CLUSTERING

kmeans 알고리듬은 초깃값에 따라 성능에 영향을 많이 받는 문제가 있어 초깃값 10개를 사용했다.

8.1 K-means (complete data)

# Upper bound: k-means on the fully observed, standardised data.
true_label = dat$V1

# nstart = 10 restarts reduce sensitivity to the random initial centroids.
complete <- kmeans(dat2_scaled, centers = 3, nstart =10 )
print(adjustedRandIndex(complete$cluster, true_label))
## [1] 0.897495
# Validation indices computed against the true labels in the full scaled space.
stat <- cluster.stats(d = dist((dat2_scaled)), complete$cluster, dat$V1)
stat$dunn
## [1] 0.2322567

8.2 K-means reference model

fully observed variable 만을 이용하여 클러스터링 진행

8.2.1 dat13 & dat 27 동일

# use only complete variable
# Lower bound: cluster using only the columns without any NA.  The same four
# variables carry the NAs in both data sets, so dat13 and dat27 give the same
# fully observed block.
F_o = dat13[,colSums(is.na(dat13))==0]

init <- kmeans(scale(F_o), centers = 3, iter.max = 100000, nstart = 10)
print(adjustedRandIndex(init$cluster, true_label))
## [1] 0.7310778
# NOTE: the distance matrix uses the full scaled data (dat2_scaled), not F_o,
# so the Dunn index is evaluated in the original 13-variable space.
stat <- cluster.stats(d = dist((dat2_scaled)), init$cluster, dat$V1)
stat$dunn
## [1] 0.1769191

최소한 0.731 보다는 높아야 함

먼저, 사전에 결측대체를 진행하지 않는 알고리듬의 성능을 보고자 한다.

8.3 KSC

8.3.1 dat13_scaled

가장 적합한 w (weight를 찾아줘야함) 이건 그냥 예시

print(adjustedRandIndex(test$membership, true_label))
## [1] 0.8350791
stat$dunn
## [1] 0.1769191

8.3.2 dat27_scaled

가장 적합한 w (weight 찾는건 simulation에서 다룸)

print(adjustedRandIndex(test$membership, true_label))
## [1] 0.8350791
stat$dunn
## [1] 0.154815

8.4 K-POD

8.4.1 dat13_scaled

B1<-kpod(as.matrix(dat13_scaled),3)
stat <- cluster.stats(d = dist((dat2_scaled)),B1$cluster, dat$V1)
stat$wb
## [1] 0.6717521
stat$dunn
## [1] 0.1601327
adjustedRandIndex(B1$cluster, true_label)
## [1] 0.7920856

8.4.2 data27_scaled

B1<-kpod(as.matrix(dat27_scaled),3)
stat <- cluster.stats(d = dist((dat2_scaled)),B1$cluster, dat$V1)
stat$wb
## [1] 0.6735558
stat$dunn
## [1] 0.1620794
adjustedRandIndex(B1$cluster, true_label)
## [1] 0.7787078

8.5 ClusterImpute

8.5.1 data13_scaled

# ClustImpute on the standardised 13% data (imputation inside the clustering).
A1<-ClustImpute(dat13_scaled,3)
# FIX: the result element is `clusters` (used two lines below); the original
# `A1$cluster` only resolved via $ partial matching.
stat <- cluster.stats(d = dist((dat2_scaled)),A1$clusters, dat$V1)
stat$wb
## [1] 0.6600923
stat$dunn
## [1] 0.1620794
adjustedRandIndex(A1$clusters, true_label) 
## [1] 0.8486782

8.5.2 data27_scaled

# ClustImpute on the standardised 27% data.
A1<-ClustImpute(dat27_scaled,3)
# FIX: use the documented `clusters` element instead of relying on
# $ partial matching with `A1$cluster`.
stat <- cluster.stats(d = dist((dat2_scaled)),A1$clusters, dat$V1)
stat$wb
## [1] 0.6600923
stat$dunn
## [1] 0.1620794
adjustedRandIndex(A1$clusters, true_label) 
## [1] 0.8486782

다음으로는 사전에 결측대체를 진행하고 클러스터링을 진행하는 방법의 성능을 보고자 한다. 이때 관건은 얼마나 그럴듯한 값으로 대체를 하냐일것이다. 이때, 표준화를 거친 후에 결측대체를 진행한 경우와, 표준화를 거치기 전에 결측대체를 진행하고 클러스터링을 실시한 경우로 나누어 생각했다. 대체로 전자의 경우 성능이 좋았다.

8.6 K-NN Imputation

8.6.1 dat13_scaled

# KNN imputation on the standardised 13% data with k = 1 and k = 15,
# followed by k-means on each completed data set.
dat13_scaled1<-knnImputation(dat13_scaled, k = 1, scale = T, meth = "weighAvg", distData = NULL)
dat13_scaled15<-knnImputation(dat13_scaled, k = 15, scale = T, meth = "weighAvg", distData = NULL)

S11<-kmeans(dat13_scaled1,3 , nstart =10);  #S11$cluster
S11_stat <- cluster.stats(d = dist((dat2_scaled)), S11$cluster, dat$V1)
print(adjustedRandIndex(S11$cluster,true_label))
## [1] 0.8486782
S11_stat$dunn
## [1] 0.1620794
S12<-kmeans(dat13_scaled15,3, nstart = 10 );    #S12$cluster
S12_stat <- cluster.stats(d = dist((dat2_scaled)), S12$cluster, dat$V1)
print(adjustedRandIndex(S12$cluster,true_label))
## [1] 0.8652458
# FIX: the original printed S11_stat$dunn again (copy-paste); report the
# k = 15 model's statistic here.
S12_stat$dunn

8.6.2 dat27_scaled

# KNN imputation on the standardised 27% data with k = 1 and k = 15,
# followed by k-means on each completed data set.
dat27_scaled1<-knnImputation(dat27_scaled, k = 1, scale = T, meth = "weighAvg", distData = NULL)
dat27_scaled15<-knnImputation(dat27_scaled, k = 15, scale = T, meth = "weighAvg", distData = NULL)

S13<-kmeans(dat27_scaled1,3, nstart = 10);  #S13$cluster
S13_stat <- cluster.stats(d = dist((dat2_scaled)), S13$cluster, dat$V1)
print(adjustedRandIndex(S13$cluster,true_label))
## [1] 0.7423645
S13_stat$dunn
## [1] 0.1324693
S14<-kmeans(dat27_scaled15,3, nstart = 10 );    #S14$cluster
S14_stat <- cluster.stats(d = dist((dat2_scaled)), S14$cluster, dat$V1)
print(adjustedRandIndex(S14$cluster,true_label))
## [1] 0.8353221
S14_stat$dunn
## [1] 0.1772105

8.6.3 dat13 not scaled imputation

# KNN imputation on the raw (unscaled) 13% data, then scale before k-means.
dat13_1<-knnImputation(dat13, k = 1, scale = T, meth = "weighAvg", distData = NULL)
dat13_15<-knnImputation(dat13, k = 15, scale = T, meth = "weighAvg", distData = NULL)

S11<-kmeans(scale(dat13_1),3, nstart = 10); #S11$cluster
# FIX: recompute the validation statistics for these new clusterings; the
# original printed the stale S11_stat/S12_stat left over from the
# scaled-imputation section above.
S11_stat <- cluster.stats(d = dist((dat2_scaled)), S11$cluster, dat$V1)
print(adjustedRandIndex(S11$cluster,true_label))
## [1] 0.8653209
S11_stat$dunn
S12<-kmeans(scale(dat13_15),3, nstart = 10);    #S12$cluster
S12_stat <- cluster.stats(d = dist((dat2_scaled)), S12$cluster, dat$V1)
print(adjustedRandIndex(S12$cluster,true_label))
## [1] 0.8483196
S12_stat$dunn

8.6.4 dat27 not scaled impu + scale

# KNN imputation on the raw (unscaled) 27% data, then scale before k-means.
dat27_1<-knnImputation(dat27, k = 1, scale = T, meth = "weighAvg", distData = NULL)
dat27_15<-knnImputation(dat27, k = 15, scale = T, meth = "weighAvg", distData = NULL)

S13<-kmeans(scale(dat27_1),3, nstart =10);  #S13$cluster
# FIX: recompute the validation statistics for these new clusterings; the
# original printed the stale S13_stat/S14_stat from the scaled-imputation
# section above.
S13_stat <- cluster.stats(d = dist((dat2_scaled)), S13$cluster, dat$V1)
print(adjustedRandIndex(S13$cluster,true_label))
## [1] 0.7423645
S13_stat$dunn
S14<-kmeans(scale(dat27_15),3, nstart = 10 );   #S14$cluster
S14_stat <- cluster.stats(d = dist((dat2_scaled)), S14$cluster, dat$V1)
print(adjustedRandIndex(S14$cluster,true_label))
## [1] 0.8186897
S14_stat$dunn

8.7 Mean-Imputation

8.7.1 dat13_scaled

print(adjustedRandIndex(S21$cluster,true_label))
## [1] 0.7763463
S21_stat$dunn
## [1] 0.1520128

8.7.2 dat27_scaled

S22_stat$dunn
## [1] 0.1605927
print(adjustedRandIndex(S22$cluster,true_label))
## [1] 0.7787078

8.7.3 dat13 not scaled impu + scale

S21_stat$dunn
## [1] 0.1537044
print(adjustedRandIndex(S21$cluster,true_label))
## [1] 0.6989163

8.7.4 dat27 not scaled impu + scale

S22_stat$dunn
## [1] 0.1605927
print(adjustedRandIndex(S22$cluster,true_label))
## [1] 0.6456088

8.8 Unconditional Hot-Deck

8.8.1 dat13_scaled

print(adjustedRandIndex(S31$cluster,true_label))
## [1] 0.7209152
S31_stat$dunn
## [1] 0.1537044

8.8.2 dat27_scaled

S32_stat$dunn
## [1] 0.1300351
print(adjustedRandIndex(S32$cluster,true_label))
## [1] 0.8376253

8.8.3 dat13 not scale impu + scale

print(adjustedRandIndex(S31$cluster,true_label))
## [1] 0.7920856
S31_stat$dunn
## [1] 0.1520128

8.8.4 dat27 not scale impu + scale

print(adjustedRandIndex(S32$cluster,true_label))
## [1] 0.7671606
S32_stat$dunn
## [1] 0.1288482

8.9 MI with pmm

MI 의 경우 cluster label 이 매번 임의로 label 되는 점을 해결하기 위해서 함수를 작성

# Return the most frequent value in `v`; ties go to the value that appears
# first in `v`, because which.max picks the first maximum.
getmode <- function(v) {
    vals <- unique(v)
    counts <- tabulate(match(v, vals))
    vals[which.max(counts)]
}

# Majority-vote wrapper for multiple imputation: cluster each of the 10
# imputed data sets with k-means, remap each run's arbitrary cluster labels
# onto a common 1/2/3 scheme, then take the per-observation mode.
vote <- function(multiple_imputation_set){
    # 10 x n matrix: one clustering result per imputed data set.
    result = matrix(c(rep(0, nrow(dat13_scaled)*10)), nrow = 10)
    # Row blocks of the true classes (59 / 71 / 48 wines, in file order);
    # used only to anchor a consistent labelling across imputations.
    group1 = 1:59
    group2 = 60:130
    group3 = 131:178
    for (i in 1:10){
        data = complete(multiple_imputation_set, i)
        init <- kmeans(scale(data), centers = 3, iter.max = 100000, nstart = 10)
        result[i,] = init$cluster
    
        ## Find which arbitrary k-means label dominates each true block.
        index1 = getmode(result[i,group1])
        index2 = getmode(result[i,group2])
        index3 = getmode(result[i,group3])
    
        ## Remap labels via a temporary 11/12/13 coding so the three
        ## assignments cannot overwrite one another, then shift back to 1..3.
        result[i,result[i,]==index1] = 11
        result[i,result[i,]==index2] = 12
        result[i,result[i,]==index3] = 13
        result[i,] = result[i,]-10
    }
    # Per-observation majority vote across the 10 imputations.
    final_result = apply(result,2,getmode)
    return(final_result)
}

8.9.1 dat13_scaled

pmm 방법 사용해서 imputation set 10개를 만듬

adjustedRandIndex(vote(pmm), true_label)
## [1] 0.8639879
print(stat$dunn)
## [1] 0.1620794

8.9.2 dat27_scaled

adjustedRandIndex(vote(pmm), true_label)
## [1] 0.8188574
stat$dunn
## [1] 0.1830305

8.9.3 dat13_scaled

pmm 방법 사용해서 imputation set 10개를 만듬

adjustedRandIndex(vote(pmm), true_label)
## [1] 0.8640004
stat$dunn
## [1] 0.203264

8.9.4 dat27_scaled

adjustedRandIndex(vote(pmm), true_label)
## [1] 0.800492
stat$dunn
## [1] 0.2204796

8.10 MI with cart

8.10.1 dat13_scaled

adjustedRandIndex(vote(cart), true_label)
## [1] 0.8321902
stat$dunn
## [1] 0.1538608

8.10.2 dat27_scaled

adjustedRandIndex(vote(cart), true_label)
## [1] 0.7566207
stat$dunn
## [1] 0.1520128

8.10.3 dat13

adjustedRandIndex(vote(cart), true_label)
## [1] 0.7870769
stat$dunn
## [1] 0.1520128

8.10.4 dat27

adjustedRandIndex(vote(cart), true_label)
## [1] 0.7881732
stat$dunn
## [1] 0.1737495

8.11 MI with rf

8.11.1 dat13_scaled

adjustedRandIndex(vote(rf), true_label)
## [1] 0.8637748
stat$dunn
## [1] 0.1620794

8.11.2 dat27_scaled

adjustedRandIndex(vote(rf), true_label)
## [1] 0.8651097
stat$dunn
## [1] 0.2322567

8.11.3 dat13

adjustedRandIndex(vote(rf), true_label)
## [1] 0.8475188
stat$dunn
## [1] 0.1620794

8.11.4 dat27_scaled

adjustedRandIndex(vote(rf), true_label)
## [1] 0.8180556
stat$dunn
## [1] 0.1538608

8.12 MI with norm

8.12.1 dat13_scaled

adjustedRandIndex(vote(norm), true_label)
## [1] 0.8008503
stat$dunn
## [1] 0.1620794

8.12.2 dat27_scaled

adjustedRandIndex(vote(norm), true_label)
## [1] 0.7556455
stat$dunn
## [1] 0.192957

8.12.3 dat13

adjustedRandIndex(vote(norm), true_label)
## [1] 0.8475188
stat$dunn
## [1] 0.1620794

8.12.4 dat27

adjustedRandIndex(vote(norm), true_label)
## [1] 0.8174184
stat$dunn
## [1] 0.1830305

8.13 MI with norm.predict

8.13.1 dat13_scaled

adjustedRandIndex(vote(norm.predict), true_label)
## [1] 0.8651097
stat$dunn
## [1] 0.2322567

8.13.2 dat27_scaled

adjustedRandIndex(vote(norm.predict), true_label)
## [1] 0.7406843
stat$dunn
## [1] 0.1538608

8.13.3 dat13

adjustedRandIndex(vote(norm.predict), true_label)
## [1] 0.8651097
stat$dunn
## [1] 0.2322567

8.13.4 dat27

adjustedRandIndex(vote(norm.predict), true_label)
## [1] 0.7406843
stat$dunn
## [1] 0.1538608

8.14 MI with norm.nob

8.14.1 dat13_scaled

adjustedRandIndex(vote(norm.nob), true_label)
## [1] 0.8653209
stat$dunn
## [1] 0.203264

8.14.2 dat27_scaled

adjustedRandIndex(vote(norm.nob), true_label)
## [1] 0.7699792
stat$dunn
## [1] 0.1538608

8.14.3 dat13_scaled

adjustedRandIndex(vote(norm.nob), true_label)
## [1] 0.8486782
stat$dunn
## [1] 0.203264

8.14.4 dat27_scaled

adjustedRandIndex(vote(norm.nob), true_label)
## [1] 0.7401273
stat$dunn
## [1] 0.1737495

8.15 EM & DA

DA 를 이용한 imputation 과 같은 경우에는 초깃값에 따라 영향을 받는 경우가 많아 범위로 보는 것이 더 적절할 것이다. 뒤에서 시뮬레이션을 따로 다루고자 한다.

8.15.1 dat13_scaled

# DA (data augmentation) imputation over 50 seeds x 10 step counts; each
# completed data set is clustered and the ARI recorded.
iter = seq(5,50,5)
result_da_13 = matrix(c(rep(0, 50*length(iter))), nrow = 50)
for (i in 1:50){
    for (j in 1:length(iter)){
        rngseed(i)
        # NOTE(review): this overwrites `s`, which earlier held the PCA score
        # matrix used to generate the missingness — harmless here, but the
        # missing-generation chunks must not be re-run after this point.
        s<- prelim.norm(as.matrix(dat13_scaled))
        thetahat <- em.norm(s,showits = F)
        theta <- da.norm(s, thetahat, steps = iter[j], showits =  F)
        #getparam.norm(s, theta)
        ximp <- imp.norm(s, theta, dat13_scaled)
        da<-kmeans(ximp,3);#    S31$cluster
        result_da_13[i,j]=adjustedRandIndex(da$cluster,true_label)
    }
}

8.15.2 dat27_scaled

iter = seq(5,50,5)
result_da_27 = matrix(c(rep(0, 50*length(iter))), nrow = 50)
for (i in 1:50){
    for (j in 1:length(iter)){
        rngseed(i)
        s<- prelim.norm(as.matrix(dat27_scaled))
        thetahat <- em.norm(s,showits = F)
        theta <- da.norm(s, thetahat, steps = iter[j], showits =  F)
        #getparam.norm(s, theta)
        ximp <- imp.norm(s, theta, dat27_scaled)
        da<-kmeans(ximp,3);#    S31$cluster
        result_da_27[i,j]=adjustedRandIndex(da$cluster,true_label)
    }
}

8.15.3 dat13

# DA sweep on the UNSCALED 13%-missing data; the imputed data are scaled only
# for the k-means step. Deterministic prelim/EM calls hoisted out of the loops.
iter = seq(5, 50, 5)
result_da_13_nonscale = matrix(0, nrow = 50, ncol = length(iter))
s <- prelim.norm(as.matrix(dat13))
thetahat <- em.norm(s, showits = FALSE)
for (i in 1:50){
    for (j in seq_along(iter)){
        rngseed(i)
        theta <- da.norm(s, thetahat, steps = iter[j], showits = FALSE)
        ximp <- imp.norm(s, theta, dat13)
        da <- kmeans(scale(ximp), 3)   # scale after imputation
        result_da_13_nonscale[i,j] = adjustedRandIndex(da$cluster, true_label)
    }
}

# Best ARI over all seeds and DA step settings (unscaled 13% data).
max(result_da_13_nonscale)
## [1] 0.8820585

8.15.4 dat27

# DA sweep on the UNSCALED 27%-missing data; scaling applied only before
# k-means. Deterministic prelim/EM calls hoisted out of the loops.
iter = seq(5, 50, 5)
result_da_27_nonscale = matrix(0, nrow = 50, ncol = length(iter))
s <- prelim.norm(as.matrix(dat27))
thetahat <- em.norm(s, showits = FALSE)
for (i in 1:50){
    for (j in seq_along(iter)){
        rngseed(i)
        theta <- da.norm(s, thetahat, steps = iter[j], showits = FALSE)
        ximp <- imp.norm(s, theta, dat27)
        da <- kmeans(scale(ximp), 3)
        result_da_27_nonscale[i,j] = adjustedRandIndex(da$cluster, true_label)
    }
}

# Best ARI over all seeds and DA step settings (unscaled 27% data).
max(result_da_27_nonscale)
## [1] 0.8992963

9 Simulation

일부 알고리듬의 stability를 보고자 하여 여러 초깃값에 대해 ARI를 산출했다. 가로축은 초깃값을 뜻하고 세로축은 ARI를 뜻한다. 이때 kmeans를 활용하는 경우의 nstart option을 100으로 주었으므로, 초깃값에 따른 변동분은 순전히 imputation에 기인한다고 볼 수 있다고 생각한다.

9.1 EM

9.1.1 dat13_scaled

# ARI per initial seed for EM imputation on dat13_scaled.
index =1:100
result_em_13 = data.frame(result_em_13, index)
ggplot(result_em_13, aes(x =index, y = result_em_13))+
    geom_line()+
    geom_point()

9.1.2 dat27_scaled

# ARI per initial seed for EM imputation on dat27_scaled.
index =1:100
result_em_27 = data.frame(result_em_27, index)
ggplot(result_em_27, aes(x =index, y = result_em_27))+
    geom_line()+
    geom_point()

9.2 DA

9.2.1 dat13_scaled

# Seed-stability simulation for DA imputation (steps grid 5,15,...,45).
# nstart = 100 makes k-means insensitive to its own starts, so the remaining
# variation over i is attributable to the imputation. Deterministic
# prelim.norm()/em.norm() calls hoisted out of the loops.
iter = seq(5, 50, 10)
result_da_13 = matrix(0, nrow = 100, ncol = length(iter))
s <- prelim.norm(as.matrix(dat13_scaled))
thetahat <- em.norm(s, showits = FALSE)
for (i in 1:100){
    for (j in seq_along(iter)){
        rngseed(i)
        theta <- da.norm(s, thetahat, steps = iter[j], showits = FALSE)
        ximp <- imp.norm(s, theta, dat13_scaled)
        da <- kmeans(ximp, 3, nstart = 100)
        result_da_13[i,j] = adjustedRandIndex(da$cluster, true_label)
    }
}

da의 steps option = 5

# Column X1 corresponds to the smallest DA steps setting on the grid (5).
index =1:100
result_da_13 = data.frame(result_da_13, index)
ggplot(result_da_13)+
    geom_line(mapping = aes(x= index, y = X1), color = "red")+
    geom_point(mapping = aes(x= index, y = X1), color = "red")

da의 steps option = 45 (grid `seq(5, 50, 10)`의 마지막 값은 50이 아니라 45이다)

# Column X5 = the largest steps value on the grid (45). The original re-wrapped
# result_da_13 in data.frame() a second time, which appended a duplicated index
# column (auto-renamed index.1); the conversion already done above is reused.
ggplot(result_da_13)+
    geom_line(mapping = aes(x= index, y = X5), color = "orange")+
    geom_point(mapping = aes(x= index, y = X5), color = "orange")

9.2.2 dat27_scaled

# Seed-stability simulation for DA imputation on dat27_scaled; same structure
# as the 13% run, with deterministic prelim/EM calls hoisted out of the loops.
iter = seq(5, 50, 10)
result_da_27 = matrix(0, nrow = 100, ncol = length(iter))
s <- prelim.norm(as.matrix(dat27_scaled))
thetahat <- em.norm(s, showits = FALSE)
for (i in 1:100){
    for (j in seq_along(iter)){
        rngseed(i)
        theta <- da.norm(s, thetahat, steps = iter[j], showits = FALSE)
        ximp <- imp.norm(s, theta, dat27_scaled)
        da <- kmeans(ximp, 3, nstart = 100)
        result_da_27[i,j] = adjustedRandIndex(da$cluster, true_label)
    }
}

da의 steps option = 5

# Column X1 corresponds to the smallest DA steps setting on the grid (5).
index =1:100
result_da_27 = data.frame(result_da_27, index)
ggplot(result_da_27)+
    geom_line(mapping = aes(x= index, y = X1), color = "red")+
    geom_point(mapping = aes(x= index, y = X1), color = "red")

da의 steps option = 45 (grid `seq(5, 50, 10)`의 마지막 값은 50이 아니라 45이다)

# Column X5 = the largest steps value on the grid (45). The redundant second
# data.frame() wrap (which duplicated the index column) is dropped; the
# conversion performed above is reused.
ggplot(result_da_27)+
    geom_line(mapping = aes(x= index, y = X5), color = "orange")+
    geom_point(mapping = aes(x= index, y = X5), color = "orange")

9.3 Unconditional Hot-Deck

9.3.1 dat13_scaled

# ARI per seed for unconditional hot-deck imputation (13% missing).
result13_hotdeck = data.frame(result13_hotdeck, index)
ggplot(result13_hotdeck)+
    geom_line(mapping = aes(x= index, y = result13_hotdeck))+
    geom_point(mapping = aes(x= index, y = result13_hotdeck))

9.3.2 dat27_scaled

# ARI per seed for unconditional hot-deck imputation (27% missing).
result27_hotdeck = data.frame(result27_hotdeck, index)
ggplot(result27_hotdeck)+
    geom_line(mapping = aes(x= index, y = result27_hotdeck))+
    geom_point(mapping = aes(x= index, y = result27_hotdeck))

9.4 norm.nob

9.4.1 dat13_scaled

# ARI per seed for norm.nob imputation (13% missing).
result_norm.nob13 = data.frame(result_norm.nob13, index)
ggplot(result_norm.nob13)+
    geom_line(mapping = aes(x= index, y = result_norm.nob13))+
    geom_point(mapping = aes(x= index, y = result_norm.nob13))

9.4.2 dat27_scaled

# ARI per seed for norm.nob imputation (27% missing).
result_norm.nob27 = data.frame(result_norm.nob27, index)
ggplot(result_norm.nob27)+
    geom_line(mapping = aes(x= index, y = result_norm.nob27))+
    geom_point(mapping = aes(x= index, y = result_norm.nob27))

9.5 KSC

9.5.1 dat13_scaled

코드를 돌려 둔 파일이 있어서 다시 돌리지 않음: ksc_search_grid13, ksc_search_grid13_object

# Load precomputed KSC grid-search results: ARI and loss value per
# weight (rows) x random initialisation (columns). First column dropped
# (row-number column written by write.csv).
result13_ksc = read.csv("/Users/iseunghun/course/결측데이터분석/project2/ksc_search_grid13.csv")[,-1]
result13_ksc_object = read.csv("/Users/iseunghun/course/결측데이터분석/project2/ksc_search_grid13_object.csv")[,-1]
max(result13_ksc)
## [1] 0.8350791

최대 성능은 0.835

weight 파라미터에 따른 adj index(초기치에 대한 영향력을 줄이기 위해 웨이트 마다 10개 초기치에 대해서 평균을 내림 )

# Mean ARI per weight value: each row of the grid is averaged over its 10
# random initialisations to dampen the influence of any single start, then
# plotted against the weight grid.
mean_adjrandindex <- rowMeans(result13_ksc)
w <- seq(0.2, 0.95, by = 0.05)
temp <- data.frame(w, mean_adjrandindex)

ggplot(temp) +
    geom_point(mapping = aes(x = w, y = mean_adjrandindex)) +
    geom_line(mapping = aes(x = w, y = mean_adjrandindex))

0.5가 최적의 파라미터란 사실을 알 수 있음

9.5.2 dat27_scaled

코드 돌린 파일 있어서 안돌림

# Load precomputed KSC grid-search results for the 27%-missing data set.
result27_ksc = read.csv("/Users/iseunghun/course/결측데이터분석/project2/ksc_search_grid27.csv")[,-1]
result27_ksc_object = read.csv("/Users/iseunghun/course/결측데이터분석/project2/ksc_search_grid27_object.csv")[,-1]
max(result27_ksc)
## [1] 0.8350791

최대는 0.835

# Mean ARI per weight value for the 27%-missing data, averaged over the 10
# initialisations in each row of the grid.
mean_adjrandindex <- rowMeans(result27_ksc)
w <- seq(0.2, 0.95, by = 0.05)
temp <- data.frame(w, mean_adjrandindex)

ggplot(temp) +
    geom_point(mapping = aes(x = w, y = mean_adjrandindex)) +
    geom_line(mapping = aes(x = w, y = mean_adjrandindex))

w = 0.5가 최적이었다.

9.5.3 loss value 와 ARI간의 관계

KSC 의 경우 k-means 방법을 기본으로 사용하기 때문에 클러스터 centroid 의 initial 값을 어디에 주냐에 따라서 성능이 달라진다. 보통의 k-means는 loss value 가 가장 낮았던 초깃값을 사용하면 되지만 KSC에서는 가장 loss value 가 작았던 초깃값이 ARI가 가장 높은 초깃값과 일치하는지는 밝혀지지 않아 확인해야 할 필요성을 느꼈다.

13% missing의 경우

# Per weight (row): which initialisation attains the best ARI ...
#ARI
apply(result13_ksc,1, which.max)
##  [1]  8  1  8 10  1  1  1  1  1  7  1  1  1  5  1  1
# ... versus which initialisation attains the lowest loss value.
#loss value
apply(result13_ksc_object,1,which.min)
##  [1]  2  1  1 10  7  5  5  1  1  7  2  1  3  5  2  8

27% missing의 경우

# Same best-ARI vs. lowest-loss initialisation comparison, 27% missing.
#ARI
apply(result27_ksc,1, which.max)
##  [1]  4  4  5  2 10  1  5  1  1  1  1  1  1  5  1  1
#loss value
apply(result27_ksc_object,1,which.min)
##  [1]  4  1  1  6  1 10  5  1  1  1  7  2  7  9  3  2

확인 결과 꼭 일치하지 않는다는 사실을 알 수 있었다. 게다가 KSC에서는 w 하이퍼 파라미터를 결정해야 하는 문제가 있다.(어떤 값을 쓰느냐에 따라서 성능 차이가 심하게 발생한다.) 이론상으로는 결측이 발생한 변수가 클러스터링에 중요할수록 w를 높이면 되지만 정확한 값을 얻기 어려울 뿐더러 이런 정보가 항상 가능한 것도 아니다. 또한 클러스터 레이블이 있는 데이터셋의 일부만 가지고 w를 결정하는 것도 가능은 하겠지만, 이는 클러스터링보다는 분류 문제에 가깝지 않나 생각한다.(즉 실용성이 없다.)

9.6 K-pod

9.6.1 13% missing

# 100 k-POD runs over different seeds; ARI against the true labels per run.
result_kpod13 = matrix(rep(0,100), ncol = 1)
for (i in 1:100){
    set.seed(i)
    B1<-kpod(as.matrix(dat13_scaled),3,maxiter = 1000)
    result_kpod13[i,] = adjustedRandIndex(B1$cluster, true_label)
}
# `index` (1:100) carried over from the EM section above.
result_kpod13 = data.frame(result_kpod13, index)
ggplot(result_kpod13)+
    geom_line(mapping = aes(x= index, y = result_kpod13))+
    geom_point(mapping = aes(x= index, y = result_kpod13))

summary(result_kpod13[,1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7136  0.7921  0.7921  0.7772  0.7921  0.7921

9.6.2 27% missing

# 100 k-POD runs on the 27%-missing data.
# NOTE(review): maxiter = 1000 is set in the 13% run but the package default is
# used here — confirm this difference is intentional.
result_kpod27 = matrix(rep(0,100), ncol = 1)
for (i in 1:100){
    set.seed(i)
    B1<-kpod(as.matrix(dat27_scaled),3)
    result_kpod27[i,] = adjustedRandIndex(B1$cluster, true_label)
}
result_kpod27 = data.frame(result_kpod27, index)
ggplot(result_kpod27)+
    geom_line(mapping = aes(x= index, y = result_kpod27))+
    geom_point(mapping = aes(x= index, y = result_kpod27))

summary(result_kpod27[,1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7787  0.7787  0.7787  0.7787  0.7787  0.7787

9.7 ClustImpute

9.7.1 13% missing

# 100 ClustImpute runs over different seeds; ARI against the true labels.
result_clustimpute13 = matrix(rep(0,100), ncol = 1)
for (i in 1:100){
    set.seed(i)
    A1<-ClustImpute(dat13_scaled,3)
    result_clustimpute13[i,] = adjustedRandIndex(A1$cluster, true_label)
}
result_clustimpute13 = data.frame(result_clustimpute13, index)
ggplot(result_clustimpute13)+
    geom_line(mapping = aes(x= index, y = result_clustimpute13))+
    geom_point(mapping = aes(x= index, y = result_clustimpute13))

summary(result_clustimpute13[,1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.8487  0.8487  0.8487  0.8487  0.8487  0.8487

9.7.2 27% missing

# 100 ClustImpute runs on the 27%-missing data.
result_clustimpute27 = matrix(rep(0,100), ncol = 1)
for (i in 1:100){
    set.seed(i)
    A1<-ClustImpute(dat27_scaled,3)
    result_clustimpute27[i,] = adjustedRandIndex(A1$cluster, true_label)
}
result_clustimpute27 = data.frame(result_clustimpute27, index)
ggplot(result_clustimpute27)+
    geom_line(mapping = aes(x= index, y = result_clustimpute27))+
    geom_point(mapping = aes(x= index, y = result_clustimpute27))

summary(result_clustimpute27[,1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.8487  0.8487  0.8487  0.8487  0.8487  0.8487

imputation에 기반한 방법론들은 imputation 값에 따라서 클러스터링 성능에 편차가 심하게 난다. 그 반면에 사전에 imputation을 진행하지 않는 방법들의 경우 거의 동일한 클러스터링 성능을 보인다.

10 시각화 예시

인덱스 맞춰주는 함수

# Row ranges of the three true wine classes (59 / 71 / 48 observations).
group1 <- 1:59
group2 <- 60:130
group3 <- 131:178

# Align an arbitrary cluster labelling with the true group order: the modal
# label found inside each true group is remapped to 1, 2, 3 respectively.
# Temporary labels 11-13 are used so that an earlier replacement can never
# collide with a later one.
unite <- function(result) {
    modal_labels <- c(getmode(result[group1]),
                      getmode(result[group2]),
                      getmode(result[group3]))
    for (g in 1:3) {
        result[result == modal_labels[g]] <- 10 + g
    }
    result - 10
}

10.1 TRUE

# Reference plot: first two principal components of the scaled complete data,
# coloured by the TRUE cluster label.
pca = prcomp(dat2_scaled)$x[,c(1,2)]
ho = data.frame(pca,dat$V1)
colnames(ho) = c("PC1","PC2","cluster")

ggplot(ho)+
    geom_point(mapping = aes( x= PC1, y =PC2 , color = as.factor(cluster)))

10.2 mean imputation

10.2.1 13% missing

# Mean-impute the 13%-missing data via mice (single imputation), cluster with
# k-means, relabel via unite(), and plot on the complete-data PC plane.
dat13_scaledm<-complete(mice(dat13_scaled,method="mean",m=1)); 
## 
##  iter imp variable
##   1   1  V7  V8  V12  V13
##   2   1  V7  V8  V12  V13
##   3   1  V7  V8  V12  V13
##   4   1  V7  V8  V12  V13
##   5   1  V7  V8  V12  V13
S21<-kmeans(dat13_scaledm,3,nstart= 10);    #S21$cluster
pca = prcomp(dat2_scaled)$x[,c(1,2)]
ho = data.frame(pca,unite(S21$cluster))
colnames(ho) = c("PC1","PC2","cluster")

ggplot(ho)+
    geom_point(mapping = aes( x= PC1, y =PC2 , color = as.factor(cluster)))

10.2.2 27% missing

# Same mean-imputation visualization for the 27%-missing data.
dat27_scaledm<-complete(mice(dat27_scaled,method="mean",m=1)); 
## 
##  iter imp variable
##   1   1  V7  V8  V12  V13
##   2   1  V7  V8  V12  V13
##   3   1  V7  V8  V12  V13
##   4   1  V7  V8  V12  V13
##   5   1  V7  V8  V12  V13
S21<-kmeans(dat27_scaledm,3,nstart= 10);    #S21$cluster
pca = prcomp(dat2_scaled)$x[,c(1,2)]
ho = data.frame(pca,unite(S21$cluster))
colnames(ho) = c("PC1","PC2","cluster")

ggplot(ho)+
    geom_point(mapping = aes( x= PC1, y =PC2 , color = as.factor(cluster)))

10.3 clustimpute

10.3.1 13% missing

# ClustImpute clustering of the 13%-missing data, shown on the PC plane.
A1<-ClustImpute(dat13_scaled,3)
pca = prcomp(dat2_scaled)$x[,c(1,2)]
ho = data.frame(pca,unite(A1$cluster))
colnames(ho) = c("PC1","PC2","cluster")

ggplot(ho)+
    geom_point(mapping = aes( x= PC1, y =PC2 , color = as.factor(cluster)))

10.3.2 27% missing

# ClustImpute clustering of the 27%-missing data, shown on the PC plane.
A1<-ClustImpute(dat27_scaled,3)
pca = prcomp(dat2_scaled)$x[,c(1,2)]
ho = data.frame(pca,unite(A1$cluster))
colnames(ho) = c("PC1","PC2","cluster")

ggplot(ho)+
    geom_point(mapping = aes( x= PC1, y =PC2 , color = as.factor(cluster)))

10.4 PMM

10.4.1 13% missing

# NOTE(review): no PMM imputation/clustering code precedes this plot — `ho`
# still holds the result of the previous (ClustImpute) section, so this does
# not show PMM. The PMM chunk appears to be missing from the rendered
# document; TODO confirm and restore it.
ggplot(ho)+
    geom_point(mapping = aes( x= PC1, y =PC2 , color = as.factor(cluster)))

10.4.2 27% missing

# NOTE(review): as with the 13% case, `ho` is not rebuilt from a PMM result
# before this plot — the PMM 27% chunk appears to be missing. TODO confirm.
ggplot(ho)+
    geom_point(mapping = aes( x= PC1, y =PC2 , color = as.factor(cluster)))

11 imputation 체크

성능이 좋았던 imputation 방법과 안좋았던 imputation 방법의 bias 가 어떻게 나타나는지를 보고자 함.

11.1 Bias check (그룹을 고려하지 않음)

centering 했기 때문에 true 값은 0 이 나올것

11.1.1 mean

11.1.1.1 13% missing

# Overall mean of each mean-imputed variable; the data were centred before
# masking, so values near 0 indicate no overall bias.
mean(dat13_scaledm$V7) 
## [1] -1.026994e-16
mean(dat13_scaledm$V8)
## [1] 1.675701e-16
mean(dat13_scaledm$V12)
## [1] -1.936787e-16
mean(dat13_scaledm$V13)
## [1] -3.761207e-17

11.1.1.2 27% missing

# Mean imputation, 27% missing: overall means (true value ~ 0).
mean(dat27_scaledm$V7) 
## [1] -2.645057e-16
mean(dat27_scaledm$V8)
## [1] 6.731921e-17
mean(dat27_scaledm$V12)
## [1] -2.005335e-16
mean(dat27_scaledm$V13)
## [1] -1.762718e-16

11.1.2 KNN - 15

11.1.2.1 13% missing

# KNN (k = 15) imputation, 13% missing: overall means (true value ~ 0).
mean(dat13_scaled15$V7) 
## [1] -0.01568219
mean(dat13_scaled15$V8)
## [1] -0.01012611
mean(dat13_scaled15$V12)
## [1] -0.02413975
mean(dat13_scaled15$V13)
## [1] -0.007041984

11.1.2.2 27% missing

# KNN (k = 15) imputation, 27% missing: overall means (true value ~ 0).
mean(dat27_scaled15$V7) 
## [1] -0.0159088
mean(dat27_scaled15$V8)
## [1] 0.002686998
mean(dat27_scaled15$V12)
## [1] -0.02312645
mean(dat27_scaled15$V13)
## [1] 0.01167743

11.1.3 Unconditional

11.1.3.1 13% missing

# Unconditional hot-deck imputation, 13% missing: overall means (true ~ 0).
mean(dat13_scaledh$V7) 
## [1] 0.007278275
mean(dat13_scaledh$V8)
## [1] 0.0007385788
mean(dat13_scaledh$V12)
## [1] 0.01494679
mean(dat13_scaledh$V13)
## [1] 0.00112046

11.1.3.2 27% missing

# Unconditional hot-deck imputation, 27% missing: overall means (true ~ 0).
mean(dat27_scaledh$V7) 
## [1] -0.01848199
mean(dat27_scaledh$V8)
## [1] 0.02449277
mean(dat27_scaledh$V12)
## [1] -0.02031891
mean(dat27_scaledh$V13)
## [1] 0.0425121

11.1.4 pmm

11.1.4.1 13% missing

V7

# Mean of PMM-imputed V7 averaged over the m = 10 imputations.
# BUG FIX: the original reported the raw SUM of the 10 per-imputation means
# (never divided by 10), which overstated the apparent bias tenfold relative
# to the single-imputation methods above.
impu = 0 
for (i in 1:10){
impu = impu + mean(complete(pmm13,i)$V7)
}
impu / 10
## [1] 0.1326244   (output predates the fix; with /10 this is ~0.0133)

V8

# Mean of PMM-imputed V8 averaged over the 10 imputations (sum/10 — see the
# V7 chunk: the original reported the un-averaged sum).
impu = 0 
for (i in 1:10){
impu = impu + mean(complete(pmm13,i)$V8)
}
impu / 10
## [1] 0.1853244   (output predates the fix; with /10 this is ~0.0185)

V12

# Mean of PMM-imputed V12 averaged over the 10 imputations (sum/10 fix).
impu = 0 
for (i in 1:10){
impu = impu + mean(complete(pmm13,i)$V12)
}
impu / 10
## [1] 0.07837746   (output predates the fix; with /10 this is ~0.0078)

V13

# Mean of PMM-imputed V13 averaged over the 10 imputations (sum/10 fix).
impu = 0 
for (i in 1:10){
impu = impu + mean(complete(pmm13,i)$V13)
}
impu / 10
## [1] -0.06703638   (output predates the fix; with /10 this is ~-0.0067)

11.2 Bias check 그룹 별로

11.2.1 13%

11.2.1.1 V7

# Per-cluster bias of V7 (true group mean minus imputed group mean), 13% missing.
true- m         # mean imputation
##   V1        mean
## 1  0  0.08667527
## 2  0  0.09687105
## 3  0 -0.24982678
true- knn15     # KNN, k = 15
##   V1        mean
## 1  0  0.01088159
## 2  0  0.06770110
## 3  0 -0.05536171
true - hot      # unconditional hot-deck
##   V1        mean
## 1  0  0.09247370
## 2  0  0.05079736
## 3  0 -0.21579363
true[,2] - pmm  # PMM
##           mean
## 1 -0.017769185
## 2 -0.005051854
## 3 -0.019867709

11.2.1.2 V8

# Per-cluster bias of V8, 13% missing.
true- m         # mean imputation
##   V1        mean
## 1  0  0.10910734
## 2  0  0.08639061
## 3  0 -0.26189721
true- knn15     # KNN, k = 15
##   V1        mean
## 1  0  0.01785856
## 2  0  0.04789199
## 3  0 -0.05524040
true - hot      # unconditional hot-deck
##   V1       mean
## 1  0  0.1104413
## 2  0  0.1101193
## 3  0 -0.3013746
true[,2] - pmm  # PMM
##           mean
## 1  0.006408273
## 2 -0.031407916
## 3 -0.030143744

11.2.1.3 V12

# Per-cluster bias of V12, 13% missing.
true- m         # mean imputation
##   V1        mean
## 1  0  0.08744112
## 2  0  0.06440151
## 3  0 -0.20274028
true- knn15     # KNN, k = 15
##   V1        mean
## 1  0  0.05406414
## 2  0  0.03009380
## 3  0 -0.02144935
true - hot      # unconditional hot-deck
##   V1       mean
## 1  0  0.1128722
## 2  0  0.0170909
## 3  0 -0.2194467
true[,2] - pmm  # PMM
##          mean
## 1  0.04600703
## 2 -0.01730175
## 3 -0.06002310

11.2.1.4 V13

# Per-cluster bias of V13, 13% missing.
true- m         # mean imputation
##   V1        mean
## 1  0  0.10658423
## 2  0  0.07686309
## 3  0 -0.24470310
true- knn15     # KNN, k = 15
##   V1        mean
## 1  0  0.04864109
## 2  0  0.01561124
## 3  0 -0.05676560
true - hot      # unconditional hot-deck
##   V1        mean
## 1  0  0.11344062
## 2  0  0.08167337
## 3  0 -0.26440100
true[,2] - pmm  # PMM
##           mean
## 1  0.042386005
## 2 -0.014768742
## 3 -0.005394709

11.2.2 27% missing

11.2.3 V7

# Per-cluster bias of V7, 27% missing.
true-m          # mean imputation
##   V1       mean
## 1  0  0.1860152
## 2  0  0.1007617
## 3  0 -0.3776870
true-knn15      # KNN, k = 15
##   V1        mean
## 1  0  0.04017108
## 2  0  0.08235247
## 3  0 -0.11219485
true-hot        # unconditional hot-deck
##   V1       mean
## 1  0  0.2094141
## 2  0  0.1466471
## 3  0 -0.4057830
true[,2]-pmm #pmm
##          mean
## 1  0.01491486
## 2 -0.01320186
## 3 -0.00200316

11.2.4 V8

# Per-cluster bias of V8, 27% missing.
true-m          # mean imputation
##   V1        mean
## 1  0  0.22061726
## 2  0  0.09301439
## 3  0 -0.40875917
true-knn15      # KNN, k = 15
##   V1        mean
## 1  0  0.05941421
## 2  0  0.02856339
## 3  0 -0.12524427
true-hot        # unconditional hot-deck
##   V1        mean
## 1  0  0.30228609
## 2  0  0.02684983
## 3  0 -0.50210273
true[,2]-pmm #pmm
##          mean
## 1  0.03966937
## 2 -0.09766230
## 3 -0.05193020

11.2.5 V12

# Per-cluster bias of V12, 27% missing.
true-m          # mean imputation
##   V1       mean
## 1  0 -0.2760183
## 2  0  0.4737709
## 3  0 -0.3615137
true-knn15      # KNN, k = 15
##   V1        mean
## 1  0  0.04579164
## 2  0  0.08184790
## 3  0 -0.09159167
true-hot        # unconditional hot-deck
##   V1       mean
## 1  0  0.1937087
## 2  0  0.1254466
## 3  0 -0.3483074
true[,2]-pmm #pmm
##            mean
## 1 -0.0181769789
## 2 -0.0003994343
## 3 -0.1233341437

11.2.6 V13

# Per-cluster bias of V13, 27% missing.
true-m          # mean imputation
##   V1       mean
## 1  0  0.1908502
## 2  0  0.1410300
## 3  0 -0.4431936
true-knn15      # KNN, k = 15
##   V1        mean
## 1  0  0.04988614
## 2  0  0.02343661
## 3  0 -0.13928882
true-hot        # unconditional hot-deck
##   V1       mean
## 1  0  0.2478954
## 2  0  0.0542209
## 3  0 -0.5425555
true[,2]-pmm #pmm
##           mean
## 1  0.064558642
## 2 -0.001638195
## 3 -0.096772225

단순한 알고리듬일수록, 그리고 cluster3,2,1 으로 갈수록 cluster centroid 의 bias 가 심해진다는 것을 알 수 있었다.